{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3",
   "language": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "source": [
    "# I - Consolidations of datasets"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "   timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n0         28800001       28802000        0.50        0.77        3.13   \n1         28802001       28804000        1.00        0.67        3.32   \n2         28804001       28806000        1.00        0.59        3.77   \n3         28806001       28808000        0.50        0.00        3.12   \n4         28808001       28810000        0.67        1.16        3.11   \n\n   kSQI_score  pSQI_score  basSQI_score  classif  classif_avg  patient  \n0       11.74        0.51          0.98        1          1.0   103001  \n1       13.03        0.52          0.97        1          1.0   103001  \n2       16.35        0.51          0.97        1          1.0   103001  \n3       12.99        0.53          0.97        1          1.0   103001  \n4       11.62        0.52          0.95        1          1.0   103001  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>patient</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>28800001</td>\n      <td>28802000</td>\n      <td>0.50</td>\n      <td>0.77</td>\n      <td>3.13</td>\n      <td>11.74</td>\n      <td>0.51</td>\n      <td>0.98</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>28802001</td>\n      <td>28804000</td>\n      <td>1.00</td>\n      <td>0.67</td>\n      <td>3.32</td>\n      <td>13.03</td>\n      <td>0.52</td>\n      <td>0.97</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>28804001</td>\n      <td>28806000</td>\n      <td>1.00</td>\n      <td>0.59</td>\n      <td>3.77</td>\n      <td>16.35</td>\n      <td>0.51</td>\n      <td>0.97</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>28806001</td>\n      <td>28808000</td>\n      <td>0.50</td>\n      <td>0.00</td>\n      <td>3.12</td>\n      <td>12.99</td>\n      <td>0.53</td>\n      <td>0.97</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>28808001</td>\n      <td>28810000</td>\n      <td>0.67</td>\n      <td>1.16</td>\n      <td>3.11</td>\n      <td>11.62</td>\n      <td>0.52</td>\n      <td>0.95</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient count by classification:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "         classif\npatient         \n103001      1200\n111001     45321\n113001      1800\n124001      2400",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>1200</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>45321</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>1800</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>2400</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient average by classification:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "          classif  classif_avg\npatient                       \n103001   0.554167     0.723268\n111001   0.348668     0.419445\n113001   0.525556     0.661498\n124001   0.115000     0.286503",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n      <th>classif_avg</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>0.554167</td>\n      <td>0.723268</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>0.348668</td>\n      <td>0.419445</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>0.525556</td>\n      <td>0.661498</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>0.115000</td>\n      <td>0.286503</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "patients = [103001, 111001, 113001, 124001]\n",
    "\n",
    "df_ml_conso = pd.read_csv('../../datasets/2_dataset_creation_2s/df_ml_{}_norm.csv'.format(patients[0]))\n",
    "df_ml_conso['patient'] = patients[0]\n",
    "\n",
    "for patient in patients[1:]:\n",
    "    df_temp = pd.read_csv('../../datasets/2_dataset_creation_2s/df_ml_{}_norm.csv'.format(patient))\n",
    "    df_temp['patient'] = patient\n",
    "    df_ml_conso = pd.concat([df_ml_conso,df_temp], axis=0)\n",
    "\n",
    "display(df_ml_conso.head())\n",
    "\n",
    "print('\\nPatient count by classification:')\n",
    "display(pd.pivot_table(df_ml_conso, values=['classif'], index='patient',aggfunc='count'))\n",
    "\n",
    "print('\\nPatient average by classification:')\n",
    "display(pd.pivot_table(df_ml_conso, values=['classif', 'classif_avg'], index='patient',aggfunc=np.mean))"
   ]
  },
  {
   "source": [
    "# II - Appying threshold of quality"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient count:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "         classif\npatient         \n103001      1200\n111001     45321\n113001      1800\n124001      2400",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>1200</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>45321</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>1800</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>2400</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient average by classification:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "          classif  classif_avg  classif_threshold\npatient                                          \n103001   0.554167     0.723268           0.560000\n111001   0.348668     0.419445           0.353214\n113001   0.525556     0.661498           0.535556\n124001   0.115000     0.286503           0.127917",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>classif_threshold</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>0.554167</td>\n      <td>0.723268</td>\n      <td>0.560000</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>0.348668</td>\n      <td>0.419445</td>\n      <td>0.353214</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>0.525556</td>\n      <td>0.661498</td>\n      <td>0.535556</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>0.115000</td>\n      <td>0.286503</td>\n      <td>0.127917</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "# Setting a threshold : proportion of optimal by observation\n",
    "classif_threshold = 0.95\n",
    "\n",
    "df_ml_conso['classif_threshold'] = df_ml_conso['classif_avg'].apply(lambda x: 1 if x >= classif_threshold else 0)\n",
    "\n",
    "print('\\nPatient count:')\n",
    "display(pd.pivot_table(df_ml_conso, values=['classif'], index='patient',aggfunc='count'))\n",
    "\n",
    "print('\\nPatient average by classification:')\n",
    "display(pd.pivot_table(df_ml_conso, values=['classif', 'classif_avg', 'classif_threshold'], index='patient',aggfunc=np.mean))"
   ]
  },
  {
   "source": [
    "# III - Creation of a validation dataset"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ml_conso_for_model, df_ml_conso_validation = train_test_split(df_ml_conso, test_size=0.2, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "   timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n0         28800001       28802000        0.50        0.77        3.13   \n1         28802001       28804000        1.00        0.67        3.32   \n2         28804001       28806000        1.00        0.59        3.77   \n3         28806001       28808000        0.50        0.00        3.12   \n4         28808001       28810000        0.67        1.16        3.11   \n\n   kSQI_score  pSQI_score  basSQI_score  classif  classif_avg  patient  \\\n0       11.74        0.51          0.98        1          1.0   103001   \n1       13.03        0.52          0.97        1          1.0   103001   \n2       16.35        0.51          0.97        1          1.0   103001   \n3       12.99        0.53          0.97        1          1.0   103001   \n4       11.62        0.52          0.95        1          1.0   103001   \n\n   classif_threshold  \n0                  1  \n1                  1  \n2                  1  \n3                  1  \n4                  1  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>patient</th>\n      <th>classif_threshold</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>28800001</td>\n      <td>28802000</td>\n      <td>0.50</td>\n      <td>0.77</td>\n      <td>3.13</td>\n      <td>11.74</td>\n      <td>0.51</td>\n      <td>0.98</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>28802001</td>\n      <td>28804000</td>\n      <td>1.00</td>\n      <td>0.67</td>\n      <td>3.32</td>\n      <td>13.03</td>\n      <td>0.52</td>\n      <td>0.97</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>28804001</td>\n      <td>28806000</td>\n      <td>1.00</td>\n      <td>0.59</td>\n      <td>3.77</td>\n      <td>16.35</td>\n      <td>0.51</td>\n      <td>0.97</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>28806001</td>\n      <td>28808000</td>\n      <td>0.50</td>\n      <td>0.00</td>\n      <td>3.12</td>\n      <td>12.99</td>\n      <td>0.53</td>\n      <td>0.97</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>28808001</td>\n      <td>28810000</td>\n      <td>0.67</td>\n      <td>1.16</td>\n      <td>3.11</td>\n      <td>11.62</td>\n      <td>0.52</td>\n      <td>0.95</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>103001</td>\n      <td>1</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Shape : (50721, 12)\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "       timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n33380         66762076       66764075        1.00        0.00        4.88   \n9669          19338286       19340285        0.50        0.71        3.92   \n7697          15394265       15396264        0.33        0.83        0.07   \n8111          16222273       16224272        0.57        0.73       -0.11   \n16857         33714942       33716941        0.33        0.30       -0.60   \n\n       kSQI_score  pSQI_score  basSQI_score  classif  classif_avg  patient  \\\n33380       28.97        0.43          0.90        1          1.0   111001   \n9669        19.29        0.55          0.93        1          1.0   111001   \n7697        -1.41        0.47          0.52        0          0.0   111001   \n8111        -0.11        0.48          0.60        0          0.0   111001   \n16857       19.35        0.46          0.84        1          1.0   111001   \n\n       classif_threshold  \n33380                  1  \n9669                   1  \n7697                   0  \n8111                   0  \n16857                  1  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>patient</th>\n      <th>classif_threshold</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>33380</th>\n      <td>66762076</td>\n      <td>66764075</td>\n      <td>1.00</td>\n      <td>0.00</td>\n      <td>4.88</td>\n      <td>28.97</td>\n      <td>0.43</td>\n      <td>0.90</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>111001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>9669</th>\n      <td>19338286</td>\n      <td>19340285</td>\n      <td>0.50</td>\n      <td>0.71</td>\n      <td>3.92</td>\n      <td>19.29</td>\n      <td>0.55</td>\n      <td>0.93</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>111001</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>7697</th>\n      <td>15394265</td>\n      <td>15396264</td>\n      <td>0.33</td>\n      <td>0.83</td>\n      <td>0.07</td>\n      <td>-1.41</td>\n      <td>0.47</td>\n      <td>0.52</td>\n      <td>0</td>\n      <td>0.0</td>\n      <td>111001</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>8111</th>\n      <td>16222273</td>\n      <td>16224272</td>\n      <td>0.57</td>\n      <td>0.73</td>\n      <td>-0.11</td>\n      <td>-0.11</td>\n      <td>0.48</td>\n      <td>0.60</td>\n      <td>0</td>\n      <td>0.0</td>\n      <td>111001</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>16857</th>\n      <td>33714942</td>\n      <td>33716941</td>\n      <td>0.33</td>\n      <td>0.30</td>\n      <td>-0.60</td>\n      <td>19.35</td>\n      <td>0.46</td>\n      <td>0.84</td>\n      <td>1</td>\n      <td>1.0</td>\n      <td>111001</td>\n      <td>1</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Shape : (40576, 12)\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "       timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n33380         66762076       66764075        1.00        0.00        4.88   \n9669          19338286       19340285        0.50        0.71        3.92   \n7697          15394265       15396264        0.33        0.83        0.07   \n8111          16222273       16224272        0.57        0.73       -0.11   \n16857         33714942       33716941        0.33        0.30       -0.60   \n\n       kSQI_score  pSQI_score  basSQI_score  classification  \n33380       28.97        0.43          0.90               1  \n9669        19.29        0.55          0.93               1  \n7697        -1.41        0.47          0.52               0  \n8111        -0.11        0.48          0.60               0  \n16857       19.35        0.46          0.84               1  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classification</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>33380</th>\n      <td>66762076</td>\n      <td>66764075</td>\n      <td>1.00</td>\n      <td>0.00</td>\n      <td>4.88</td>\n      <td>28.97</td>\n      <td>0.43</td>\n      <td>0.90</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>9669</th>\n      <td>19338286</td>\n      <td>19340285</td>\n      <td>0.50</td>\n      <td>0.71</td>\n      <td>3.92</td>\n      <td>19.29</td>\n      <td>0.55</td>\n      <td>0.93</td>\n      <td>1</td>\n    </tr>\n    <tr>\n      <th>7697</th>\n      <td>15394265</td>\n      <td>15396264</td>\n      <td>0.33</td>\n      <td>0.83</td>\n      <td>0.07</td>\n      <td>-1.41</td>\n      <td>0.47</td>\n      <td>0.52</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>8111</th>\n      <td>16222273</td>\n      <td>16224272</td>\n      <td>0.57</td>\n      <td>0.73</td>\n      <td>-0.11</td>\n      <td>-0.11</td>\n      <td>0.48</td>\n      <td>0.60</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>16857</th>\n      <td>33714942</td>\n      <td>33716941</td>\n      <td>0.33</td>\n      <td>0.30</td>\n      <td>-0.60</td>\n      <td>19.35</td>\n      <td>0.46</td>\n      <td>0.84</td>\n      <td>1</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Shape : (40576, 9)\n"
     ]
    }
   ],
   "source": [
    "df_ml_conso_validation = df_ml_conso_for_model.drop(['patient', 'classif', 'classif_avg'], axis=1)\n",
    "df_ml_conso_validation.rename(columns={df_ml_conso_validation.columns[-1]: \"classification\"}, inplace=True)\n",
    "\n",
    "for element in  [df_ml_conso, df_ml_conso_for_model, df_ml_conso_validation]:\n",
    "    display(element.head())\n",
    "    print('Shape : {}'.format(element.shape))\n",
    "\n",
    "df_ml_conso_validation.to_csv('../../datasets/3_ml_patients_consolidation/df_ml_conso_validation_norm_2s.csv', index=False)"
   ]
  },
  {
   "source": [
    "# IV - Equalisation of repartition by patient"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient count:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "         classif\npatient         \n103001       820\n111001     25680\n113001      1342\n124001       472",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>820</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>25680</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>1342</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>472</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nPatient average by classification:\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "          classif  classif_avg  classif_threshold\npatient                                          \n103001   0.493902     0.689179                0.5\n111001   0.493341     0.549923                0.5\n113001   0.488077     0.636614                0.5\n124001   0.453390     0.598689                0.5",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>classif</th>\n      <th>classif_avg</th>\n      <th>classif_threshold</th>\n    </tr>\n    <tr>\n      <th>patient</th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>103001</th>\n      <td>0.493902</td>\n      <td>0.689179</td>\n      <td>0.5</td>\n    </tr>\n    <tr>\n      <th>111001</th>\n      <td>0.493341</td>\n      <td>0.549923</td>\n      <td>0.5</td>\n    </tr>\n    <tr>\n      <th>113001</th>\n      <td>0.488077</td>\n      <td>0.636614</td>\n      <td>0.5</td>\n    </tr>\n    <tr>\n      <th>124001</th>\n      <td>0.453390</td>\n      <td>0.598689</td>\n      <td>0.5</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\nProportion of each class\n1    14157\n0    14157\nName: classif_threshold, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "df_ml_conso_balanced = pd.DataFrame()\n",
    "\n",
    "for patient in patients:\n",
    "    df_class1 = df_ml_conso_for_model[(df_ml_conso_for_model['patient'] == patient) & (df_ml_conso_for_model['classif_threshold'] ==1)]\n",
    "    df_class0 = df_ml_conso_for_model[(df_ml_conso_for_model['patient'] == patient) & (df_ml_conso_for_model['classif_threshold'] ==0)]\n",
    "\n",
    "    if df_class1.shape[0] >= df_class0.shape[0]:\n",
    "        df_ml_conso_balanced = pd.concat([df_ml_conso_balanced,\n",
    "                              df_class0,\n",
    "                              df_class1.sample(df_class0.shape[0])]\n",
    "                            )\n",
    "    else:\n",
    "        df_ml_conso_balanced = pd.concat([df_ml_conso_balanced,\n",
    "                        df_class0.sample(df_class1.shape[0]),\n",
    "                        df_class1]\n",
    "                    )\n",
    "\n",
    "print('\\nPatient count:')\n",
    "display(pd.pivot_table(df_ml_conso_balanced, values=['classif'], index='patient',aggfunc='count'))\n",
    "\n",
    "print('\\nPatient average by classification:')\n",
    "display(pd.pivot_table(df_ml_conso_balanced, values=['classif', 'classif_avg', 'classif_threshold'], index='patient',aggfunc=np.mean))\n",
    "\n",
    "print('\\nProportion of each class')\n",
    "print(df_ml_conso_balanced['classif_threshold'].value_counts())"
   ]
  },
  {
   "source": [
    "# V - Export"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "      timestamp_start  timestamp_end  qSQI_score  cSQI_score  sSQI_score  \\\n979          58358143       58360142        0.67        0.59        2.38   \n961          58322141       58324140        0.67        1.09        3.79   \n1185         58770181       58772180        0.33        0.80        1.72   \n90           28980002       28982001        0.50        0.67        2.78   \n682          57764045       57766044        0.80        0.56        4.05   \n\n      kSQI_score  pSQI_score  basSQI_score  classification  \n979        10.16        0.52          0.95               0  \n961        17.60        0.50          0.90               0  \n1185        6.64        0.54          0.88               0  \n90          9.10        0.50          0.95               0  \n682        19.19        0.52          0.98               0  ",
      "text/html": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>timestamp_start</th>\n      <th>timestamp_end</th>\n      <th>qSQI_score</th>\n      <th>cSQI_score</th>\n      <th>sSQI_score</th>\n      <th>kSQI_score</th>\n      <th>pSQI_score</th>\n      <th>basSQI_score</th>\n      <th>classification</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>979</th>\n      <td>58358143</td>\n      <td>58360142</td>\n      <td>0.67</td>\n      <td>0.59</td>\n      <td>2.38</td>\n      <td>10.16</td>\n      <td>0.52</td>\n      <td>0.95</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>961</th>\n      <td>58322141</td>\n      <td>58324140</td>\n      <td>0.67</td>\n      <td>1.09</td>\n      <td>3.79</td>\n      <td>17.60</td>\n      <td>0.50</td>\n      <td>0.90</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>1185</th>\n      <td>58770181</td>\n      <td>58772180</td>\n      <td>0.33</td>\n      <td>0.80</td>\n      <td>1.72</td>\n      <td>6.64</td>\n      <td>0.54</td>\n      <td>0.88</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>90</th>\n      <td>28980002</td>\n      <td>28982001</td>\n      <td>0.50</td>\n      <td>0.67</td>\n      <td>2.78</td>\n      <td>9.10</td>\n      <td>0.50</td>\n      <td>0.95</td>\n      <td>0</td>\n    </tr>\n    <tr>\n      <th>682</th>\n      <td>57764045</td>\n      <td>57766044</td>\n      <td>0.80</td>\n      <td>0.56</td>\n      <td>4.05</td>\n      <td>19.19</td>\n      <td>0.52</td>\n      <td>0.98</td>\n      <td>0</td>\n    </tr>\n  </tbody>\n</table>\n</div>"
     },
     "metadata": {}
    }
   ],
   "source": [
    "df_ml_conso_balanced = df_ml_conso_balanced.drop(['patient', 'classif', 'classif_avg'], axis=1)\n",
    "df_ml_conso_balanced.rename(columns={df_ml_conso_balanced.columns[-1]: \"classification\"}, inplace=True)\n",
    "display(df_ml_conso_balanced.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ml_conso_balanced.to_csv('../../datasets/3_ml_patients_consolidation/df_ml_conso_balanced_norm_2s.csv', index=False)"
   ]
  }
 ]
}